started: Alexey Larionov, 2016
last updated: Alexey Larionov, 09Feb2017

Summary

Overall, eigenvectors are calculated for 3 datasets:

This script deals with wecare-only dataset. Additionally it exports data to text files to repeat wecare-only calculations in eigenstrat.

start_section

# Time stamp
Sys.time()
## [1] "2017-02-21 10:21:14 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"

# Required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

load_data

load(paste(interim_data_folder, "r04_filter_cases_jan2017.RData", sep="/"))

check_data

ls()
## [1] "interim_data_folder"      "wecare_nfe_exac.df"      
## [3] "wecare_nfe_genotypes.mx"  "wecare_nfe_kgen.df"      
## [5] "wecare_nfe_phenotypes.df" "wecare_nfe_variants.df"
dim(wecare_nfe_genotypes.mx)
## [1] 275516    678
class(wecare_nfe_genotypes.mx)
## [1] "matrix"
wecare_nfe_genotypes.mx[1:5,1:5]
##              HG00097 HG00099 HG00100 HG00102 HG00106
## Var000000002       0       0      NA      NA       0
## Var000000008       0      NA       0       0       0
## Var000000009       0       0       0       0       0
## Var000000012       0       0      NA       0       0
## Var000000013       0       0      NA       0       0
dim(wecare_nfe_phenotypes.df)
## [1] 678  28
str(wecare_nfe_phenotypes.df)
## 'data.frame':    678 obs. of  28 variables:
##  $ wes_id        : chr  "HG00097" "HG00099" "HG00100" "HG00102" ...
##  $ gwas_id       : chr  NA NA NA NA ...
##  $ merged_id     : chr  NA NA NA NA ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ setno         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ family_history: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_dx        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_ref       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ rstime        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig1_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig2_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig3_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stage         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ er            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pr            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hist_cat      : chr  NA NA NA NA ...
##  $ hormone       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ chemo_cat     : chr  NA NA NA NA ...
##  $ br_xray       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ br_xray_dose  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_menarche  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_1st_ftp   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_menopause : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ num_preg      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_age18     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_dx        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_ref       : num  NA NA NA NA NA NA NA NA NA NA ...
wecare_nfe_phenotypes.df[1:5,1:5]
##          wes_id gwas_id merged_id filter cc
## HG00097 HG00097    <NA>      <NA>   pass -1
## HG00099 HG00099    <NA>      <NA>   pass -1
## HG00100 HG00100    <NA>      <NA>   pass -1
## HG00102 HG00102    <NA>      <NA>   pass -1
## HG00106 HG00106    <NA>      <NA>   pass -1
dim(wecare_nfe_variants.df)
## [1] 275516     23
colnames(wecare_nfe_variants.df)
##  [1] "SplitVarID"         "SYMBOL"             "TYPE"              
##  [4] "CHROM"              "POS"                "REF"               
##  [7] "ALT"                "AC"                 "AF"                
## [10] "AN"                 "Consequence"        "SIFT_call"         
## [13] "SIFT_score"         "PolyPhen_call"      "PolyPhen_score"    
## [16] "CLIN_SIG"           "cDNA_position"      "CDS_position"      
## [19] "Codons"             "Protein_position"   "Amino_acids"       
## [22] "Existing_variation" "Multiallelic"
wecare_nfe_variants.df[1:5,1:5]
##                SplitVarID    SYMBOL  TYPE CHROM    POS
## Var000000002 Var000000002 LINC00115   SNP     1 762330
## Var000000008 Var000000008    SAMD11   SNP     1 871215
## Var000000009 Var000000009    SAMD11   SNP     1 871230
## Var000000012 Var000000012    SAMD11 INDEL     1 874778
## Var000000013 Var000000013    SAMD11   SNP     1 878664
dim(wecare_nfe_kgen.df)
## [1] 275516      9
colnames(wecare_nfe_kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
wecare_nfe_kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN     kgen.AF kgen.AFR_AF
## Var000000002 Var000000002      NA      NA          NA          NA
## Var000000008 Var000000008      NA      NA          NA          NA
## Var000000009 Var000000009      NA      NA          NA          NA
## Var000000012 Var000000012      NA      NA          NA          NA
## Var000000013 Var000000013       2    5008 0.000399361           0
dim(wecare_nfe_exac.df)
## [1] 275516     48
colnames(wecare_nfe_exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
wecare_nfe_exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000002 Var000000002        0.0120000              175
## Var000000008 Var000000008               NA               NA
## Var000000009 Var000000009               NA               NA
## Var000000012 Var000000012               NA               NA
## Var000000013 Var000000013        0.0005838               62
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000002            14012                      59
## Var000000008               NA                      NA
## Var000000009               NA                      NA
## Var000000012               NA                      NA
## Var000000013           106198                      17
# Check consistency of colnames and rownames
sum(colnames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_phenotypes.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_variants.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_kgen.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_exac.df))
## [1] 0

function_to_calculate_eigenvectors

Implements procedure described by Price et al 2006 (PMID: 16862161)

normalise_and_calculate_eigenvectors.udf <- function(x) {
  
  # --- Center and normalise variants (rows) --- #
  
  # Center by mean
  avg.rows <- apply(x, 1, mean, na.rm=TRUE)
  x.c <- x - avg.rows
  
  # Normalise by sqrt(p(1-p)) where p~"posterior estimate of unobserved allele frequency"
  # This is motivated by the fact that genetic drift per generation is proportional to this normalisation value (Patterson 2006)
  # Also this makes each column to have same variance
  # 
  p.fnc <- function(x) (1 + sum(x, na.rm=TRUE)) / (2 + 2 * sum(!is.na(x)))
  p <- apply(x, 1, p.fnc)
  eaf <- sqrt(p*(1-p))
  x.cn <- x.c/eaf
  
  # Substitute NAs to zeros
  0 -> x.cn[is.na(x)]
  
  # --- Calculate eigenvectors of covariance matrix of cases --- #
  
  cov.mx <- cov(x.cn)
  eig <- eigen(cov.mx) # eigenvectors in columns
  
  return(eig)

}

make_wecare_only_subset_of_data

# Remove nfe cases
wecare_cases <- as.vector(wecare_nfe_phenotypes.df[wecare_nfe_phenotypes.df$cc!=-1,"wes_id"])
length(wecare_cases) # 480
## [1] 480
wecare_genotypes.mx <- wecare_nfe_genotypes.mx[,wecare_cases]
dim(wecare_genotypes.mx) # 275,516 x 480
## [1] 275516    480
wecare_genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000002      0      0      0      0      0
## Var000000008      0      0     NA      0     NA
## Var000000009      0     NA     NA      0     NA
## Var000000012      0     NA     NA      0      0
## Var000000013      0     NA      0      0      0
wecare_phenotypes.df <- wecare_nfe_phenotypes.df[wecare_cases,]
dim(wecare_phenotypes.df) # 480 x 28
## [1] 480  28
wecare_phenotypes.df[1:5,1:5]
##        wes_id  gwas_id       merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568   pass  1
## P1_A02 P1_A02 id357807 P1_A02_id357807   pass  1
## P1_A03 P1_A03 id325472 P1_A03_id325472   pass  1
## P1_A04 P1_A04 id304354 P1_A04_id304354   pass  1
## P1_A05 P1_A05 id222648 P1_A05_id222648   pass  1
# Clean-up
rm(wecare_cases)

remove_variants_with_uniform_genotypes_accross_all_wecare_samples

Remove 50,141 variants: 275,516 -> 225,375

# Check that there is no all-NA variants
non_NA_count.udf <- function(x){sum(!is.na(x))}
all_NA <- apply(wecare_genotypes.mx, 1, non_NA_count.udf) == 0
sum(all_NA) # 0
## [1] 0
# Function to detect uniform numeric vector
uniform_vector.udf <- function(x){
  if(min(x, na.rm=TRUE) == max(x, na.rm=TRUE)){return(TRUE)} else {return(FALSE)}}

# Variants with uniform genotypes accross all samples 
uniform_genotypes <- apply(wecare_genotypes.mx, 1, uniform_vector.udf)
summary(uniform_genotypes)
##    Mode   FALSE    TRUE    NA's 
## logical  225375   50141       0
sum(uniform_genotypes) # 50,141
## [1] 50141
# Remove variants with uniform genotypes accross all samples
wecare_genotypes.mx <- wecare_genotypes.mx[!uniform_genotypes,]
dim(wecare_genotypes.mx) # 225375    480
## [1] 225375    480
wecare_genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000002      0      0      0      0      0
## Var000000008      0      0     NA      0     NA
## Var000000009      0     NA     NA      0     NA
## Var000000012      0     NA     NA      0      0
## Var000000013      0     NA      0      0      0
wecare_variants.df <- wecare_nfe_variants.df[!uniform_genotypes,]
dim(wecare_variants.df) # 225375     23
## [1] 225375     23
wecare_variants.df[1:5,1:5]
##                SplitVarID    SYMBOL  TYPE CHROM    POS
## Var000000002 Var000000002 LINC00115   SNP     1 762330
## Var000000008 Var000000008    SAMD11   SNP     1 871215
## Var000000009 Var000000009    SAMD11   SNP     1 871230
## Var000000012 Var000000012    SAMD11 INDEL     1 874778
## Var000000013 Var000000013    SAMD11   SNP     1 878664
wecare_kgen.df <- wecare_nfe_kgen.df[!uniform_genotypes,]
dim(wecare_kgen.df) # 225375      9
## [1] 225375      9
wecare_kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN     kgen.AF kgen.AFR_AF
## Var000000002 Var000000002      NA      NA          NA          NA
## Var000000008 Var000000008      NA      NA          NA          NA
## Var000000009 Var000000009      NA      NA          NA          NA
## Var000000012 Var000000012      NA      NA          NA          NA
## Var000000013 Var000000013       2    5008 0.000399361           0
wecare_exac.df <- wecare_nfe_exac.df[!uniform_genotypes,]
dim(wecare_exac.df) # 225375     48
## [1] 225375     48
wecare_exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000002 Var000000002        0.0120000              175
## Var000000008 Var000000008               NA               NA
## Var000000009 Var000000009               NA               NA
## Var000000012 Var000000012               NA               NA
## Var000000013 Var000000013        0.0005838               62
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000002            14012                      59
## Var000000008               NA                      NA
## Var000000009               NA                      NA
## Var000000012               NA                      NA
## Var000000013           106198                      17
# Clean-up
rm(non_NA_count.udf, all_NA, uniform_vector.udf, uniform_genotypes)

calculate_and_plot_eigenvectors_wecare_all_variants

# --- Calculate eigenvectors --- #

wecare_eigen <- normalise_and_calculate_eigenvectors.udf(wecare_genotypes.mx)

wecare_all_variants_eigenvectors.mx.df <- as.data.frame(wecare_eigen$vectors) # eigenvectors in columns
wecare_all_variants_eigenvalues <- wecare_eigen$values

# --- Prepare data for plotting --- #

# Prepare colour scale
colours <- c("UBC" = "BLUE", "CBC" = "RED")
userColourScale <- scale_colour_manual(values=colours)

# Prepare cases lables
cases_labels <- as.vector(wecare_phenotypes.df$cc)
"CBC" -> cases_labels[cases_labels==1]
"UBC" -> cases_labels[cases_labels==0]

# Prepare cases IDs
cases_IDs <- as.vector(wecare_phenotypes.df$wes_id)

# make the dataframe
data2plot.df <- cbind(cases_IDs, cases_labels, wecare_all_variants_eigenvectors.mx.df[,1:3])
colnames(data2plot.df) <- c("wes_id", "group", "ev1", "ev2", "ev3")

# --- Plot eig1 vs eig2 --- #

g <- ggplot(data2plot.df, aes(ev1, ev2)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare all variants<br>(225,375 x 480)", x ="eigenvector1", y = "eigenvector2") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Plot eig2 vs eig3 --- #

g <- ggplot(data2plot.df, aes(ev2, ev3)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare all variants<br>(225,375 x 480)", x ="eigenvector2", y = "eigenvector3") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Clean-up --- #

rm(wecare_all_variants_eigenvalues, g, data2plot.df)

calculate_outliers_all_variants

Using 6 standard deviations in 5 eigenvectors

wecare_all_variants_eigenvectors.mx <- wecare_eigen$vectors

ev1 <- wecare_all_variants_eigenvectors.mx[,1]
ev1.positive_outliers <- ev1 > mean(ev1) + 6 * sd(ev1)
ev1.negative_outliers <- ev1 < mean(ev1) - 6 * sd(ev1)
sum(ev1.positive_outliers)
## [1] 4
sum(ev1.negative_outliers)
## [1] 0
wecare_phenotypes.df$wes_id[ev1.positive_outliers]
## [1] "P1_C12" "P1_D08" "P5_E09" "P6_D05"
wecare_phenotypes.df$wes_id[ev1.negative_outliers]
## character(0)
ev2 <- wecare_all_variants_eigenvectors.mx[,2]
ev2.positive_outliers <- ev2 > mean(ev2) + 6 * sd(ev2)
ev2.negative_outliers <- ev2 < mean(ev2) - 6 * sd(ev2)
sum(ev2.positive_outliers)
## [1] 0
sum(ev2.negative_outliers)
## [1] 1
wecare_phenotypes.df$wes_id[ev2.positive_outliers]
## character(0)
wecare_phenotypes.df$wes_id[ev2.negative_outliers]
## [1] "P1_E06"
ev3 <- wecare_all_variants_eigenvectors.mx[,3]
ev3.positive_outliers <- ev3 > mean(ev3) + 6 * sd(ev3)
ev3.negative_outliers <- ev3 < mean(ev3) - 6 * sd(ev3)
sum(ev3.positive_outliers)
## [1] 1
sum(ev3.negative_outliers)
## [1] 2
wecare_phenotypes.df$wes_id[ev3.positive_outliers]
## [1] "P6_D05"
wecare_phenotypes.df$wes_id[ev3.negative_outliers]
## [1] "P1_C12" "P5_E09"
ev4 <- wecare_all_variants_eigenvectors.mx[,4]
ev4.positive_outliers <- ev4 > mean(ev4) + 6 * sd(ev4)
ev4.negative_outliers <- ev4 < mean(ev4) - 6 * sd(ev4)
sum(ev4.positive_outliers)
## [1] 1
sum(ev4.negative_outliers)
## [1] 1
wecare_phenotypes.df$wes_id[ev4.positive_outliers]
## [1] "P1_C12"
wecare_phenotypes.df$wes_id[ev4.negative_outliers]
## [1] "P5_E09"
ev5 <- wecare_all_variants_eigenvectors.mx[,5]
ev5.positive_outliers <- ev5 > mean(ev5) + 6 * sd(ev5)
ev5.negative_outliers <- ev5 < mean(ev5) - 6 * sd(ev5)
sum(ev5.positive_outliers)
## [1] 1
sum(ev5.negative_outliers)
## [1] 0
wecare_phenotypes.df$wes_id[ev5.positive_outliers]
## [1] "P1_E06"
wecare_phenotypes.df$wes_id[ev5.negative_outliers]
## character(0)
# Clean-up
rm(wecare_all_variants_eigenvectors.mx, ev1, ev1.positive_outliers, ev1.negative_outliers, 
   ev2, ev2.positive_outliers, ev2.negative_outliers, ev3, ev3.positive_outliers, ev3.negative_outliers,
   ev4, ev4.positive_outliers, ev4.negative_outliers, ev5, ev5.positive_outliers, ev5.negative_outliers)

export_wecare_all_variants_to_EIGENSTRAT_format

# --- make snp file --- #

# Extract data from variants table
wecare_all_variants_snp.df <- wecare_variants.df %>% 
  mutate(morgan = POS / 1000000) %>%  
  select(SplitVarID, CHROM, morgan, POS, REF, ALT)

# Recode CRHOM data
summary(wecare_all_variants_snp.df$CHROM)
##     1    10    11    12    13    14    15    16    17    18    19     2 
## 22787  9409 13267 11893  4130  7634  7668  9444 12300  3822 13376 16992 
##    20    21    22     3     4     5     6     7     8     9    MT     X 
##  5406  2740  4940 13234  9060 10265 13081 10359  7728  9305   361  6173 
##     Y 
##     1
str(wecare_all_variants_snp.df)
## 'data.frame':    225375 obs. of  6 variables:
##  $ SplitVarID: Factor w/ 343824 levels "Var000000001",..: 2 8 9 12 13 20 21 23 24 25 ...
##  $ CHROM     : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ morgan    : num  0.762 0.871 0.871 0.875 0.879 ...
##  $ POS       : int  762330 871215 871230 874778 878664 880461 880483 880917 881627 881825 ...
##  $ REF       : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 539 539 1231 1044 539 1 1044 1044 539 ...
##  $ ALT       : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 989 664 989 664 1 989 664 1 1 989 ...
wecare_all_variants_snp.df$CHROM <- as.vector(wecare_all_variants_snp.df$CHROM)
str(wecare_all_variants_snp.df)
## 'data.frame':    225375 obs. of  6 variables:
##  $ SplitVarID: Factor w/ 343824 levels "Var000000001",..: 2 8 9 12 13 20 21 23 24 25 ...
##  $ CHROM     : chr  "1" "1" "1" "1" ...
##  $ morgan    : num  0.762 0.871 0.871 0.875 0.879 ...
##  $ POS       : int  762330 871215 871230 874778 878664 880461 880483 880917 881627 881825 ...
##  $ REF       : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 539 539 1231 1044 539 1 1044 1044 539 ...
##  $ ALT       : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 989 664 989 664 1 989 664 1 1 989 ...
"23" -> wecare_all_variants_snp.df[wecare_all_variants_snp.df$CHROM == "X", "CHROM"]
"24" -> wecare_all_variants_snp.df[wecare_all_variants_snp.df$CHROM == "Y", "CHROM"]
"90" -> wecare_all_variants_snp.df[wecare_all_variants_snp.df$CHROM == "MT", "CHROM"]

wecare_all_variants_snp.df$CHROM <- as.factor(wecare_all_variants_snp.df$CHROM)
summary(wecare_all_variants_snp.df$CHROM)
##     1    10    11    12    13    14    15    16    17    18    19     2 
## 22787  9409 13267 11893  4130  7634  7668  9444 12300  3822 13376 16992 
##    20    21    22    23    24     3     4     5     6     7     8     9 
##  5406  2740  4940  6173     1 13234  9060 10265 13081 10359  7728  9305 
##    90 
##   361
# Write file (tab-separated)
write.table(wecare_all_variants_snp.df, 
            paste(interim_data_folder, "wecare_all_variants.snp", sep="/"), 
            quote=FALSE, sep="\t", 
            row.names=FALSE, col.names=FALSE) 

# --- make geno file --- #

# Get data from genotypes matrix
wecare_all_variants_geno.mx <- wecare_genotypes.mx

# Recode NA
9 -> wecare_all_variants_geno.mx[is.na(wecare_all_variants_geno.mx)] # NA is coded as 9

# Write file (no delimiters!)
write.table(wecare_all_variants_geno.mx, 
            paste(interim_data_folder, "wecare_all_variants.geno", sep="/"),             
            quote=FALSE, sep="", 
            row.names=FALSE, col.names=FALSE)

# --- make ind file --- #

# Samples column
samples <- as.vector(wecare_phenotypes.df$wes_id)

# Gender column
gender <- rep("F", length(samples)) # all females

# Population/Group column
group <- as.vector(wecare_phenotypes.df$cc)
"UBC" -> group[group == 0]
"CBC" -> group[group == 1]

# Get together
wecare_ind.mx <- cbind(samples, gender, group)

# Write file (tab-separated) 
write.table(wecare_ind.mx, 
            paste(interim_data_folder, "wecare.ind", sep="/"),
            quote=FALSE, sep="\t", 
            row.names=FALSE, col.names=FALSE)

# Clean-up
rm(wecare_all_variants_snp.df, wecare_all_variants_geno.mx, wecare_ind.mx, samples, gender, group)

calculate_AFs

# Calculate ac, an and af

ac_wecare_cln <- apply(wecare_genotypes.mx, 1, sum, na.rm=TRUE)
get_allele_number.udf <- function(x){2*sum(!is.na(x))}
an_wecare_cln <- apply(wecare_genotypes.mx, 1, get_allele_number.udf)
af_wecare_cln <- ac_wecare_cln/an_wecare_cln

# Ceck AFs 
# (note that uniform variants were excluded)
ac_wecare_cln[1:10]
## Var000000002 Var000000008 Var000000009 Var000000012 Var000000013 
##            4            3            1           37            1 
## Var000000020 Var000000021 Var000000023 Var000000024 Var000000025 
##            1            1            1          581            1
an_wecare_cln[1:10]
## Var000000002 Var000000008 Var000000009 Var000000012 Var000000013 
##          950          834          806          776          820 
## Var000000020 Var000000021 Var000000023 Var000000024 Var000000025 
##          956          958          958          902          928
af_wecare_cln[1:10]
## Var000000002 Var000000008 Var000000009 Var000000012 Var000000013 
##  0.004210526  0.003597122  0.001240695  0.047680412  0.001219512 
## Var000000020 Var000000021 Var000000023 Var000000024 Var000000025 
##  0.001046025  0.001043841  0.001043841  0.644124169  0.001077586
min(ac_wecare_cln)
## [1] 1
min(an_wecare_cln)
## [1] 688
min(af_wecare_cln)
## [1] 0.001041667
max(ac_wecare_cln)
## [1] 959
max(an_wecare_cln)
## [1] 960
max(af_wecare_cln)
## [1] 0.9989583
# Add updated AFs to wecare_variants.df
wecare_variants.df <- cbind(wecare_variants.df, 
  ac_wecare_cln, an_wecare_cln, af_wecare_cln)

calculate_and_plot_eigenvectors_wecare_common_variants

# --- Exclude rare variants --- #
# Note exclusion on both sides: high- and low- AFs
# Low AFs remove rare variants with common allele in reference genome
# Hight AFs remove rare variants with common allele in reference genome

wecare_common_vars <- af_wecare_cln > 0.05 & af_wecare_cln < 0.95
sum(wecare_common_vars) # 44,517
## [1] 44517
min(af_wecare_cln[wecare_common_vars])
## [1] 0.05010438
max(af_wecare_cln[wecare_common_vars])
## [1] 0.9498956
common_wecare_genotypes.mx <- wecare_genotypes.mx[wecare_common_vars,]
dim(common_wecare_genotypes.mx)
## [1] 44517   480
common_wecare_genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000024      2      1     NA      1      2
## Var000000027      0     NA     NA      1     NA
## Var000000037      0     NA     NA      0     NA
## Var000000054      2      2     NA      2     NA
## Var000000058      2      2      2      2      2
common_wecare_variants.df <- wecare_variants.df[wecare_common_vars,]
dim(common_wecare_variants.df)
## [1] 44517    26
common_wecare_variants.df[1:5,1:5]
##                SplitVarID SYMBOL TYPE CHROM    POS
## Var000000024 Var000000024  NOC2L  SNP     1 881627
## Var000000027 Var000000027  NOC2L  SNP     1 881918
## Var000000037 Var000000037  NOC2L  SNP     1 889238
## Var000000054 Var000000054 KLHL17  SNP     1 897325
## Var000000058 Var000000058 KLHL17  SNP     1 897564
common_wecare_kgen.df <- wecare_kgen.df[wecare_common_vars,]
dim(common_wecare_kgen.df)
## [1] 44517     9
common_wecare_kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN   kgen.AF kgen.AFR_AF
## Var000000024 Var000000024    2213    5008 0.4418930      0.0643
## Var000000027 Var000000027     147    5008 0.0293530      0.0038
## Var000000037 Var000000037     283    5008 0.0565096      0.0197
## Var000000054 Var000000054    4303    5008 0.8592250      0.6959
## Var000000058 Var000000058    4507    5008 0.8999600      0.8298
common_wecare_exac.df <- wecare_exac.df[wecare_common_vars,]
dim(common_wecare_exac.df)
## [1] 44517    48
common_wecare_exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000024 Var000000024            0.562            59637
## Var000000027 Var000000027            0.045             4735
## Var000000037 Var000000037               NA               NA
## Var000000054 Var000000054               NA               NA
## Var000000058 Var000000058            0.898            70551
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000024           106210                   24874
## Var000000027           106208                    1867
## Var000000037               NA                      NA
## Var000000054               NA                      NA
## Var000000058            78540                    4650
# --- Calculate eigenvectors --- #

common_wecare_eigen <- normalise_and_calculate_eigenvectors.udf(common_wecare_genotypes.mx)

wecare_common_variants_eigenvectors.df <- as.data.frame(common_wecare_eigen$vectors) # eigenvectors in columns
wecare_common_variants_eigenvalues <- common_wecare_eigen$values

# --- Prepare data for plotting --- #

data2plot.df <- cbind(cases_IDs, cases_labels, wecare_common_variants_eigenvectors.df[,1:3])
colnames(data2plot.df) <- c("wes_id", "group", "ev1", "ev2", "ev3")

# --- Plot eig1 vs eig2 --- #

g <- ggplot(data2plot.df, aes(-ev1, ev2)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare common variants<br>(44,517 x 480)", x ="-eigenvector1", y = "eigenvector2") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Plot eig2 vs eig3 --- #

g <- ggplot(data2plot.df, aes(ev2, ev3)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare common variants<br>(44,517 x 480)", x ="eigenvector2", y = "eigenvector3") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- Clean-up --- #

rm(wecare_common_variants_eigenvectors.df, wecare_common_variants_eigenvalues, g, data2plot.df, 
   cases_labels, cases_IDs, colours, userColourScale, wecare_common_vars, 
   ac_wecare_cln, an_wecare_cln, af_wecare_cln, 
   get_allele_number.udf, normalise_and_calculate_eigenvectors.udf)

export_wecare_common_variants_to_EIGENSTRAT_format

# --- make snp file --- #

# Extract data from variants table
wecare_common_variants_snp.df <- common_wecare_variants.df %>% 
  mutate(morgan = POS / 1000000) %>%  
  select(SplitVarID, CHROM, morgan, POS, REF, ALT)

# Recode CRHOM data
summary(wecare_common_variants_snp.df$CHROM)
##    1   10   11   12   13   14   15   16   17   18   19    2   20   21   22 
## 4352 1983 2783 2342  816 1587 1401 1682 2417  788 2984 3200 1118  594 1008 
##    3    4    5    6    7    8    9   MT    X    Y 
## 2485 1796 2097 2888 2083 1471 1821   16  804    1
str(wecare_common_variants_snp.df)
## 'data.frame':    44517 obs. of  6 variables:
##  $ SplitVarID: Factor w/ 343824 levels "Var000000001",..: 24 27 37 54 58 62 75 99 109 139 ...
##  $ CHROM     : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ morgan    : num  0.882 0.882 0.889 0.897 0.898 ...
##  $ POS       : int  881627 881918 889238 897325 897564 897738 900505 909309 911916 948846 ...
##  $ REF       : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 1044 1044 1044 1575 539 1044 1575 539 1575 ...
##  $ ALT       : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 1 1 1 323 323 989 323 323 989 990 ...
wecare_common_variants_snp.df$CHROM <- as.vector(wecare_common_variants_snp.df$CHROM)
str(wecare_common_variants_snp.df)
## 'data.frame':    44517 obs. of  6 variables:
##  $ SplitVarID: Factor w/ 343824 levels "Var000000001",..: 24 27 37 54 58 62 75 99 109 139 ...
##  $ CHROM     : chr  "1" "1" "1" "1" ...
##  $ morgan    : num  0.882 0.882 0.889 0.897 0.898 ...
##  $ POS       : int  881627 881918 889238 897325 897564 897738 900505 909309 911916 948846 ...
##  $ REF       : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 1044 1044 1044 1575 539 1044 1575 539 1575 ...
##  $ ALT       : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 1 1 1 323 323 989 323 323 989 990 ...
"23" -> wecare_common_variants_snp.df[wecare_common_variants_snp.df$CHROM == "X", "CHROM"]
"24" -> wecare_common_variants_snp.df[wecare_common_variants_snp.df$CHROM == "Y", "CHROM"]
"90" -> wecare_common_variants_snp.df[wecare_common_variants_snp.df$CHROM == "MT", "CHROM"]

wecare_common_variants_snp.df$CHROM <- as.factor(wecare_common_variants_snp.df$CHROM)
summary(wecare_common_variants_snp.df$CHROM)
##    1   10   11   12   13   14   15   16   17   18   19    2   20   21   22 
## 4352 1983 2783 2342  816 1587 1401 1682 2417  788 2984 3200 1118  594 1008 
##   23   24    3    4    5    6    7    8    9   90 
##  804    1 2485 1796 2097 2888 2083 1471 1821   16
# Write file (tab-separated)
write.table(wecare_common_variants_snp.df, 
            paste(interim_data_folder, "wecare_common_variants.snp", sep="/"), 
            quote=FALSE, sep="\t", 
            row.names=FALSE, col.names=FALSE) 

# --- make geno file --- #

# Get data from genotypes matrix
wecare_common_variants_geno.mx <- common_wecare_genotypes.mx

# Recode NA
9 -> wecare_common_variants_geno.mx[is.na(wecare_common_variants_geno.mx)] # NA is coded as 9

# Write file (no delimiters)
write.table(wecare_common_variants_geno.mx, 
            paste(interim_data_folder, "wecare_common_variants.geno", sep="/"),             
            quote=FALSE, sep="", 
            row.names=FALSE, col.names=FALSE)

# --- make ind file --- #
# has been made for wecare all variants

# Clean-up
rm(wecare_common_variants_snp.df, wecare_common_variants_geno.mx)

compare_eigenvalues_and_eigenvectors_in_all_and_common_variants

# --- Compare eigenvalues --- #

eval_all <- wecare_eigen$values
eval_common <- common_wecare_eigen$values

plot(eval_all, main="Wecare eigenvalues (all variants)")

plot(eval_common, main="Wecare igenvalues (common variants)")

plot(eval_all,eval_common, main="Wecare eigenvalues (all vs common variants)")

# --- Compare eigenvectors --- #

# Gather data
ev1_all <- wecare_eigen$vectors[,1]
ev1_common <- common_wecare_eigen$vectors[,1]
ev2_all <- wecare_eigen$vectors[,2]
ev2_common <- common_wecare_eigen$vectors[,2]
ev3_all <- wecare_eigen$vectors[,3]
ev3_common <- common_wecare_eigen$vectors[,3]

data2plot.df <- as.data.frame(cbind(ev1_all, ev2_all, ev3_all, ev1_common, ev2_common, ev3_common))

# Calculate correlations
cor.test(ev1_all, ev1_common) # -0.482982, p-value = 2.2e-16
## 
##  Pearson's product-moment correlation
## 
## data:  ev1_all and ev1_common
## t = -12.059, df = 478, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5487611 -0.4112591
## sample estimates:
##       cor 
## -0.482982
cor.test(ev2_all, ev2_common) # -0.2196824, p-value = 1.174e-06
## 
##  Pearson's product-moment correlation
## 
## data:  ev2_all and ev2_common
## t = -4.9232, df = 478, p-value = 1.174e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3032210 -0.1327928
## sample estimates:
##        cor 
## -0.2196824
cor.test(ev3_all, ev3_common) # -0.06442268, p-value = 0.1588
## 
##  Pearson's product-moment correlation
## 
## data:  ev3_all and ev3_common
## t = -1.4114, df = 478, p-value = 0.1588
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.15304072  0.02522321
## sample estimates:
##         cor 
## -0.06442268
# Common sence check (these eigenvectors should be orthogonal...)
cor.test(ev1_all, ev2_all) # -3.530534e-16, p-value = 1
## 
##  Pearson's product-moment correlation
## 
## data:  ev1_all and ev2_all
## t = -2.1423e-14, df = 478, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08950045  0.08950045
## sample estimates:
##           cor 
## -9.798722e-16
cor.test(ev1_common, ev2_common) # -6.391521e-16, p-value = 1
## 
##  Pearson's product-moment correlation
## 
## data:  ev1_common and ev2_common
## t = 4.9748e-15, df = 478, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08950045  0.08950045
## sample estimates:
##          cor 
## 2.275435e-16
# Make plots
g <- ggplot(data2plot.df, aes(ev1_all, ev1_common)) +
  geom_point() + 
  labs(title="Wecare eigenvector 1<br>all vs common variants (p=2e-16)")
ggplotly(g)
g <- ggplot(data2plot.df, aes(ev2_all, ev2_common)) +
  geom_point() + 
  labs(title="Wecare eigenvector 2<br>all vs common variants (p=1e-06)")
ggplotly(g)
g <- ggplot(data2plot.df, aes(ev3_all, ev3_common)) +
  geom_point() + 
  labs(title="Wecare eigenvector 3<br>all vs common variants (p=0.16)")
ggplotly(g)
# Clean-up
rm(eval_all, eval_common, ev1_all, ev2_all, ev3_all, ev1_common, ev2_common, ev3_common, g, data2plot.df)

data_summary

ls()
##  [1] "common_wecare_eigen"                   
##  [2] "common_wecare_exac.df"                 
##  [3] "common_wecare_genotypes.mx"            
##  [4] "common_wecare_kgen.df"                 
##  [5] "common_wecare_variants.df"             
##  [6] "interim_data_folder"                   
##  [7] "wecare_all_variants_eigenvectors.mx.df"
##  [8] "wecare_eigen"                          
##  [9] "wecare_exac.df"                        
## [10] "wecare_genotypes.mx"                   
## [11] "wecare_kgen.df"                        
## [12] "wecare_nfe_exac.df"                    
## [13] "wecare_nfe_genotypes.mx"               
## [14] "wecare_nfe_kgen.df"                    
## [15] "wecare_nfe_phenotypes.df"              
## [16] "wecare_nfe_variants.df"                
## [17] "wecare_phenotypes.df"                  
## [18] "wecare_variants.df"
# --- wecare nfe --- #

dim(wecare_nfe_genotypes.mx)
## [1] 275516    678
class(wecare_nfe_genotypes.mx)
## [1] "matrix"
wecare_nfe_genotypes.mx[1:5,1:5]
##              HG00097 HG00099 HG00100 HG00102 HG00106
## Var000000002       0       0      NA      NA       0
## Var000000008       0      NA       0       0       0
## Var000000009       0       0       0       0       0
## Var000000012       0       0      NA       0       0
## Var000000013       0       0      NA       0       0
dim(wecare_nfe_phenotypes.df)
## [1] 678  28
str(wecare_nfe_phenotypes.df)
## 'data.frame':    678 obs. of  28 variables:
##  $ wes_id        : chr  "HG00097" "HG00099" "HG00100" "HG00102" ...
##  $ gwas_id       : chr  NA NA NA NA ...
##  $ merged_id     : chr  NA NA NA NA ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ setno         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ family_history: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_dx        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_ref       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ rstime        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig1_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig2_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig3_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stage         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ er            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pr            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hist_cat      : chr  NA NA NA NA ...
##  $ hormone       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ chemo_cat     : chr  NA NA NA NA ...
##  $ br_xray       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ br_xray_dose  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_menarche  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_1st_ftp   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_menopause : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ num_preg      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_age18     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_dx        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_ref       : num  NA NA NA NA NA NA NA NA NA NA ...
wecare_nfe_phenotypes.df[1:5,1:5]
##          wes_id gwas_id merged_id filter cc
## HG00097 HG00097    <NA>      <NA>   pass -1
## HG00099 HG00099    <NA>      <NA>   pass -1
## HG00100 HG00100    <NA>      <NA>   pass -1
## HG00102 HG00102    <NA>      <NA>   pass -1
## HG00106 HG00106    <NA>      <NA>   pass -1
dim(wecare_nfe_variants.df)
## [1] 275516     23
colnames(wecare_nfe_variants.df)
##  [1] "SplitVarID"         "SYMBOL"             "TYPE"              
##  [4] "CHROM"              "POS"                "REF"               
##  [7] "ALT"                "AC"                 "AF"                
## [10] "AN"                 "Consequence"        "SIFT_call"         
## [13] "SIFT_score"         "PolyPhen_call"      "PolyPhen_score"    
## [16] "CLIN_SIG"           "cDNA_position"      "CDS_position"      
## [19] "Codons"             "Protein_position"   "Amino_acids"       
## [22] "Existing_variation" "Multiallelic"
wecare_nfe_variants.df[1:5,1:5]
##                SplitVarID    SYMBOL  TYPE CHROM    POS
## Var000000002 Var000000002 LINC00115   SNP     1 762330
## Var000000008 Var000000008    SAMD11   SNP     1 871215
## Var000000009 Var000000009    SAMD11   SNP     1 871230
## Var000000012 Var000000012    SAMD11 INDEL     1 874778
## Var000000013 Var000000013    SAMD11   SNP     1 878664
dim(wecare_nfe_kgen.df)
## [1] 275516      9
colnames(wecare_nfe_kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
wecare_nfe_kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN     kgen.AF kgen.AFR_AF
## Var000000002 Var000000002      NA      NA          NA          NA
## Var000000008 Var000000008      NA      NA          NA          NA
## Var000000009 Var000000009      NA      NA          NA          NA
## Var000000012 Var000000012      NA      NA          NA          NA
## Var000000013 Var000000013       2    5008 0.000399361           0
dim(wecare_nfe_exac.df)
## [1] 275516     48
colnames(wecare_nfe_exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
wecare_nfe_exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000002 Var000000002        0.0120000              175
## Var000000008 Var000000008               NA               NA
## Var000000009 Var000000009               NA               NA
## Var000000012 Var000000012               NA               NA
## Var000000013 Var000000013        0.0005838               62
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000002            14012                      59
## Var000000008               NA                      NA
## Var000000009               NA                      NA
## Var000000012               NA                      NA
## Var000000013           106198                      17
sum(colnames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_phenotypes.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_variants.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_kgen.df))
## [1] 0
sum(rownames(wecare_nfe_genotypes.mx) != rownames(wecare_nfe_exac.df))
## [1] 0
# --- wecare only all variants --- #

dim(wecare_genotypes.mx)
## [1] 225375    480
class(wecare_genotypes.mx)
## [1] "matrix"
wecare_genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000002      0      0      0      0      0
## Var000000008      0      0     NA      0     NA
## Var000000009      0     NA     NA      0     NA
## Var000000012      0     NA     NA      0      0
## Var000000013      0     NA      0      0      0
dim(wecare_phenotypes.df)
## [1] 480  28
str(wecare_phenotypes.df)
## 'data.frame':    480 obs. of  28 variables:
##  $ wes_id        : chr  "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##  $ gwas_id       : chr  "id203568" "id357807" "id325472" "id304354" ...
##  $ merged_id     : chr  "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : num  1 1 1 1 1 1 0 1 1 0 ...
##  $ setno         : int  203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
##  $ family_history: int  1 0 1 1 1 1 1 1 1 0 ...
##  $ age_dx        : int  49 35 32 33 44 28 28 38 35 36 ...
##  $ age_ref       : num  58 36 41 34 49 28 32 44 35 38 ...
##  $ rstime        : num  10.17 1.83 9.75 1.59 5.66 ...
##  $ eig1_gwas     : num  -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
##  $ eig2_gwas     : num  0.00266 0.00246 -0.0001 0.00595 0.01157 ...
##  $ eig3_gwas     : num  0.06803 0.05055 -0.00603 0.00747 0.00144 ...
##  $ stage         : num  1 2 2 1 1 1 2 1 2 1 ...
##  $ er            : num  NA 0 0 0 NA 1 1 1 1 0 ...
##  $ pr            : num  NA 0 0 NA NA 1 NA 1 0 0 ...
##  $ hist_cat      : chr  "lobular" "ductal" "medullary" "ductal" ...
##  $ hormone       : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ chemo_cat     : chr  "no" "CMF" "Oth" "no" ...
##  $ br_xray       : int  1 1 0 0 1 0 0 0 1 1 ...
##  $ br_xray_dose  : num  1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
##  $ age_menarche  : num  9 13 10 12 10 13 12 11 11 NA ...
##  $ age_1st_ftp   : num  30 0 26 0 17 0 25 28 27 18 ...
##  $ age_menopause : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ num_preg      : num  1 0 1 0 1 0 1 1 1 1 ...
##  $ bmi_age18     : num  20.8 22.5 23.6 18.6 19.9 ...
##  $ bmi_dx        : num  25.8 22.9 28.3 17.8 26.6 ...
##  $ bmi_ref       : num  33.3 22.9 33.1 17.8 29.8 ...
wecare_phenotypes.df[1:5,1:5]
##        wes_id  gwas_id       merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568   pass  1
## P1_A02 P1_A02 id357807 P1_A02_id357807   pass  1
## P1_A03 P1_A03 id325472 P1_A03_id325472   pass  1
## P1_A04 P1_A04 id304354 P1_A04_id304354   pass  1
## P1_A05 P1_A05 id222648 P1_A05_id222648   pass  1
dim(wecare_variants.df)
## [1] 225375     26
colnames(wecare_variants.df)
##  [1] "SplitVarID"         "SYMBOL"             "TYPE"              
##  [4] "CHROM"              "POS"                "REF"               
##  [7] "ALT"                "AC"                 "AF"                
## [10] "AN"                 "Consequence"        "SIFT_call"         
## [13] "SIFT_score"         "PolyPhen_call"      "PolyPhen_score"    
## [16] "CLIN_SIG"           "cDNA_position"      "CDS_position"      
## [19] "Codons"             "Protein_position"   "Amino_acids"       
## [22] "Existing_variation" "Multiallelic"       "ac_wecare_cln"     
## [25] "an_wecare_cln"      "af_wecare_cln"
wecare_variants.df[1:5,1:5]
##                SplitVarID    SYMBOL  TYPE CHROM    POS
## Var000000002 Var000000002 LINC00115   SNP     1 762330
## Var000000008 Var000000008    SAMD11   SNP     1 871215
## Var000000009 Var000000009    SAMD11   SNP     1 871230
## Var000000012 Var000000012    SAMD11 INDEL     1 874778
## Var000000013 Var000000013    SAMD11   SNP     1 878664
dim(wecare_kgen.df)
## [1] 225375      9
colnames(wecare_kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
wecare_kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN     kgen.AF kgen.AFR_AF
## Var000000002 Var000000002      NA      NA          NA          NA
## Var000000008 Var000000008      NA      NA          NA          NA
## Var000000009 Var000000009      NA      NA          NA          NA
## Var000000012 Var000000012      NA      NA          NA          NA
## Var000000013 Var000000013       2    5008 0.000399361           0
dim(wecare_exac.df)
## [1] 225375     48
colnames(wecare_exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
wecare_exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000002 Var000000002        0.0120000              175
## Var000000008 Var000000008               NA               NA
## Var000000009 Var000000009               NA               NA
## Var000000012 Var000000012               NA               NA
## Var000000013 Var000000013        0.0005838               62
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000002            14012                      59
## Var000000008               NA                      NA
## Var000000009               NA                      NA
## Var000000012               NA                      NA
## Var000000013           106198                      17
sum(colnames(wecare_genotypes.mx) != rownames(wecare_phenotypes.df))
## [1] 0
sum(rownames(wecare_genotypes.mx) != rownames(wecare_variants.df))
## [1] 0
sum(rownames(wecare_genotypes.mx) != rownames(wecare_kgen.df))
## [1] 0
sum(rownames(wecare_genotypes.mx) != rownames(wecare_exac.df))
## [1] 0
# --- wecare only common variants --- #

dim(common_wecare_genotypes.mx)
## [1] 44517   480
class(common_wecare_genotypes.mx)
## [1] "matrix"
common_wecare_genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000024      2      1     NA      1      2
## Var000000027      0     NA     NA      1     NA
## Var000000037      0     NA     NA      0     NA
## Var000000054      2      2     NA      2     NA
## Var000000058      2      2      2      2      2
dim(wecare_phenotypes.df) # same as in rare
## [1] 480  28
str(wecare_phenotypes.df)
## 'data.frame':    480 obs. of  28 variables:
##  $ wes_id        : chr  "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##  $ gwas_id       : chr  "id203568" "id357807" "id325472" "id304354" ...
##  $ merged_id     : chr  "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : num  1 1 1 1 1 1 0 1 1 0 ...
##  $ setno         : int  203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
##  $ family_history: int  1 0 1 1 1 1 1 1 1 0 ...
##  $ age_dx        : int  49 35 32 33 44 28 28 38 35 36 ...
##  $ age_ref       : num  58 36 41 34 49 28 32 44 35 38 ...
##  $ rstime        : num  10.17 1.83 9.75 1.59 5.66 ...
##  $ eig1_gwas     : num  -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
##  $ eig2_gwas     : num  0.00266 0.00246 -0.0001 0.00595 0.01157 ...
##  $ eig3_gwas     : num  0.06803 0.05055 -0.00603 0.00747 0.00144 ...
##  $ stage         : num  1 2 2 1 1 1 2 1 2 1 ...
##  $ er            : num  NA 0 0 0 NA 1 1 1 1 0 ...
##  $ pr            : num  NA 0 0 NA NA 1 NA 1 0 0 ...
##  $ hist_cat      : chr  "lobular" "ductal" "medullary" "ductal" ...
##  $ hormone       : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ chemo_cat     : chr  "no" "CMF" "Oth" "no" ...
##  $ br_xray       : int  1 1 0 0 1 0 0 0 1 1 ...
##  $ br_xray_dose  : num  1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
##  $ age_menarche  : num  9 13 10 12 10 13 12 11 11 NA ...
##  $ age_1st_ftp   : num  30 0 26 0 17 0 25 28 27 18 ...
##  $ age_menopause : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ num_preg      : num  1 0 1 0 1 0 1 1 1 1 ...
##  $ bmi_age18     : num  20.8 22.5 23.6 18.6 19.9 ...
##  $ bmi_dx        : num  25.8 22.9 28.3 17.8 26.6 ...
##  $ bmi_ref       : num  33.3 22.9 33.1 17.8 29.8 ...
wecare_phenotypes.df[1:5,1:5]
##        wes_id  gwas_id       merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568   pass  1
## P1_A02 P1_A02 id357807 P1_A02_id357807   pass  1
## P1_A03 P1_A03 id325472 P1_A03_id325472   pass  1
## P1_A04 P1_A04 id304354 P1_A04_id304354   pass  1
## P1_A05 P1_A05 id222648 P1_A05_id222648   pass  1
dim(common_wecare_variants.df)
## [1] 44517    26
colnames(common_wecare_variants.df)
##  [1] "SplitVarID"         "SYMBOL"             "TYPE"              
##  [4] "CHROM"              "POS"                "REF"               
##  [7] "ALT"                "AC"                 "AF"                
## [10] "AN"                 "Consequence"        "SIFT_call"         
## [13] "SIFT_score"         "PolyPhen_call"      "PolyPhen_score"    
## [16] "CLIN_SIG"           "cDNA_position"      "CDS_position"      
## [19] "Codons"             "Protein_position"   "Amino_acids"       
## [22] "Existing_variation" "Multiallelic"       "ac_wecare_cln"     
## [25] "an_wecare_cln"      "af_wecare_cln"
common_wecare_variants.df[1:5,1:5]
##                SplitVarID SYMBOL TYPE CHROM    POS
## Var000000024 Var000000024  NOC2L  SNP     1 881627
## Var000000027 Var000000027  NOC2L  SNP     1 881918
## Var000000037 Var000000037  NOC2L  SNP     1 889238
## Var000000054 Var000000054 KLHL17  SNP     1 897325
## Var000000058 Var000000058 KLHL17  SNP     1 897564
dim(common_wecare_kgen.df)
## [1] 44517     9
colnames(common_wecare_kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
common_wecare_kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN   kgen.AF kgen.AFR_AF
## Var000000024 Var000000024    2213    5008 0.4418930      0.0643
## Var000000027 Var000000027     147    5008 0.0293530      0.0038
## Var000000037 Var000000037     283    5008 0.0565096      0.0197
## Var000000054 Var000000054    4303    5008 0.8592250      0.6959
## Var000000058 Var000000058    4507    5008 0.8999600      0.8298
dim(common_wecare_exac.df)
## [1] 44517    48
colnames(common_wecare_exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
common_wecare_exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000024 Var000000024            0.562            59637
## Var000000027 Var000000027            0.045             4735
## Var000000037 Var000000037               NA               NA
## Var000000054 Var000000054               NA               NA
## Var000000058 Var000000058            0.898            70551
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000024           106210                   24874
## Var000000027           106208                    1867
## Var000000037               NA                      NA
## Var000000054               NA                      NA
## Var000000058            78540                    4650
sum(colnames(common_wecare_genotypes.mx) != rownames(wecare_phenotypes.df))
## [1] 0
sum(rownames(common_wecare_genotypes.mx) != rownames(common_wecare_variants.df))
## [1] 0
sum(rownames(common_wecare_genotypes.mx) != rownames(common_wecare_kgen.df))
## [1] 0
sum(rownames(common_wecare_genotypes.mx) != rownames(common_wecare_exac.df))
## [1] 0

save_data

save.image(paste(interim_data_folder, "r05a_calculate_egenvectors_wecare_only_jan2017.RData", sep="/"))

final_section

ls()
##  [1] "common_wecare_eigen"                   
##  [2] "common_wecare_exac.df"                 
##  [3] "common_wecare_genotypes.mx"            
##  [4] "common_wecare_kgen.df"                 
##  [5] "common_wecare_variants.df"             
##  [6] "interim_data_folder"                   
##  [7] "wecare_all_variants_eigenvectors.mx.df"
##  [8] "wecare_eigen"                          
##  [9] "wecare_exac.df"                        
## [10] "wecare_genotypes.mx"                   
## [11] "wecare_kgen.df"                        
## [12] "wecare_nfe_exac.df"                    
## [13] "wecare_nfe_genotypes.mx"               
## [14] "wecare_nfe_kgen.df"                    
## [15] "wecare_nfe_phenotypes.df"              
## [16] "wecare_nfe_variants.df"                
## [17] "wecare_phenotypes.df"                  
## [18] "wecare_variants.df"
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux release 6.8 (Carbon)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.5.6  ggplot2_2.2.1 dplyr_0.5.0  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.6       knitr_1.13        magrittr_1.5     
##  [4] munsell_0.4.3     viridisLite_0.1.3 colorspace_1.2-6 
##  [7] R6_2.1.2          httr_1.2.1        stringr_1.0.0    
## [10] plyr_1.8.4        tools_3.2.3       grid_3.2.3       
## [13] gtable_0.2.0      DBI_0.5           htmltools_0.3.5  
## [16] yaml_2.1.13       lazyeval_0.2.0    assertthat_0.1   
## [19] digest_0.6.10     tibble_1.1        tidyr_0.5.1      
## [22] purrr_0.2.2       formatR_1.4       base64enc_0.1-3  
## [25] htmlwidgets_0.8   evaluate_0.9      rmarkdown_1.0    
## [28] labeling_0.3      stringi_1.1.1     scales_0.4.1     
## [31] jsonlite_1.0
Sys.time()
## [1] "2017-02-21 10:24:54 GMT"